# --- 1. Load Package and Dependencies ---
# Install finetuneR from GitHub only when it is not available yet, so that
# re-running the script does not need network access or reinstall the package
# on every execution.
if (!requireNamespace("finetuneR", quietly = TRUE)) {
  devtools::install_github("wqu-nd/finetuneR")
}
library(finetuneR)
library(dplyr)
library(reticulate)
library(purrr)

# --- 2. Setup Python Environment ---
# Point reticulate at the "r-reticulate" Miniconda environment.
# required = TRUE asks reticulate to treat this environment as mandatory
# rather than silently falling back to another Python.
reticulate::use_miniconda("r-reticulate", required = TRUE)

# Run finetuneR's environment setup with a fixed global seed of 123.
setup_finetuner_env(global_seed = 123)


# --- 3. Load and Prepare Data ---
# Original 15-label mapping: each behaviour category is paired with an integer
# id from 0 to 14, assigned in the order the categories are listed below.
label_map <- data.frame(
  category = c(
    "Monitoring environmental impact",
    "Preventing pollution",
    "Strengthening ecosystems",
    "Reducing use",
    "Reusing",
    "Recycling",
    "Repurposing",
    "Encouraging and supporting others",
    "Educating and training for sustainability",
    "Creating sustainable products and processes",
    "Embracing innovation for sustainability",
    "Changing how work is done",
    "Choosing responsible alternatives",
    "Instituting programs and policies",
    "Others"
  ),
  # Zero-based ids, identical to 0:14.
  label = seq_len(15L) - 1L
)

# Read the raw data export. The path is relative, so the CSV has to be
# downloaded into the working directory before this script is run.
example_c <- read.csv("German_French_UK_China_MENA_040420.csv")

# Step 1: keep only the id, the free-text description and the category, and
# collapse every category that is not listed in label_map into "Others".
categorised_rows <- example_c %>%
  select(ID, description_behavior, Specific_cat) %>%
  mutate(
    Specific_cat = if_else(
      !(Specific_cat %in% label_map$category),
      "Others",
      Specific_cat
    )
  )

# Step 2: look up the integer label of each category and expose the columns
# under the names ID / text / label.
my_data <- categorised_rows %>%
  left_join(label_map, by = c("Specific_cat" = "category")) %>%
  select(ID, text = description_behavior, label = label)

# Quick sanity check of the prepared data.
cat("--- Data Preview ---\n")
print(head(my_data))
cat("\nNumber of labels:", n_distinct(my_data$label), "\n")


# --- 4. Set Model and Training Parameters ---
# Name of the pretrained model handed to the data preparation and fine-tuning
# calls below.
MODEL_NAME <- "distilroberta-base"
# Number of classes passed to finetune_model(). It is taken from label_map
# rather than from the labels that happen to occur in the data: label ids are
# fixed to 0..(nrow(label_map) - 1), so a category that is missing from the
# CSV must not shrink the class count below the largest label id + 1.
NUM_LABELS <- nrow(label_map)
# Root directory for training outputs; each run writes to its own
# sub-directory underneath it.
OUTPUT_DIR <- "./finetuneR-classification-results"


# --- 5. Run Training with Multiple Seeds ---
# Each run uses its own seed (11, 22, 33, ...) for BOTH the data preparation
# and the training arguments, so the seed reported in the log is the one that
# actually drives the run. Results and raw data splits are collected per run
# under the names "run_1", "run_2", ...
all_run_results <- list()
all_data_splits <- list()
n_runs <- 3

# seq_len() instead of 1:n_runs so that n_runs = 0 executes no iterations.
for (i in seq_len(n_runs)) {
  run_seed <- 11 * i
  run_name <- paste0("run_", i)
  cat(paste0("\nExecuting Run ", i, "/", n_runs, " (Seed: ", run_seed, ")..."))

  # a. Prepare data. The result is used below through its `datasets` and
  #    `data_splits` elements.
  data_preparation_output <- prepare_finetuning_data(
    df = my_data,
    task_type = "classification",
    model_name = MODEL_NAME,
    seed = run_seed
  )

  # Store the raw data splits for later (they are written to CSV after
  # training).
  all_data_splits[[run_name]] <- data_preparation_output$data_splits

  # b. Create training arguments. The seed follows the run instead of being
  #    fixed, otherwise every run would train with the same seed.
  training_args <- create_training_args(
    output_dir = file.path(OUTPUT_DIR, run_name),
    num_train_epochs = 10,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 128,
    learning_rate = 2e-5,
    warmup_steps = 0,
    weight_decay = 0.05,
    task_type = "classification",
    metric_for_best_model = "precision",
    seed = run_seed
  )

  # c. Run the fine-tuning process, passing only the tokenized datasets
  all_run_results[[run_name]] <- finetune_model(
    datasets = data_preparation_output$datasets,
    task_type = "classification",
    model_name = MODEL_NAME,
    training_args = training_args,
    num_labels = NUM_LABELS
  )
}
cat("\n\nAll training runs complete.\n")


# --- 6. Save Data Splits to CSV for Reproducibility ---
# Set this to the directory in which the split CSV files should be stored.
data_split_dir <- "Your own data direction"
# recursive = TRUE lets a nested path (e.g. "results/splits") be created in
# one call instead of failing when a parent directory does not exist yet.
if (!dir.exists(data_split_dir)) {
  dir.create(data_split_dir, recursive = TRUE)
}

for (run_name in names(all_data_splits)) {
  run_splits <- all_data_splits[[run_name]]
  # Collect the three splits of this run under the suffix used in the file
  # name.
  split_frames <- list(
    train = run_splits$train,
    validation = run_splits$validation,
    test = run_splits$test
  )
  # One file per split: <run_name>_<split>.csv
  for (split_name in names(split_frames)) {
    write.csv(
      split_frames[[split_name]],
      file.path(data_split_dir, paste0(run_name, "_", split_name, ".csv")),
      row.names = FALSE
    )
  }
}
cat(paste0(
  "\nData splits for all runs saved to the '", data_split_dir,
  "' directory.\n"
))


# --- 7. Generate Comprehensive Final Report ---
# Hand the per-run results collected in the training loop to
# summarize_run_results() and keep whatever it returns in rst_all.
# NOTE(review): label_map is presumably used to translate the numeric labels
# back into category names in the report - confirm in the finetuneR docs.
rst_all <- summarize_run_results(
  all_run_results = all_run_results,
  task_type = "classification",
  label_map = label_map
)


